{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nltk.corpus import names\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])\n",
    "random.shuffle(names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7944"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Raynell', 'female'),\n",
       " ('Stevena', 'female'),\n",
       " ('Candida', 'female'),\n",
       " ('Astrid', 'female'),\n",
       " ('Sal', 'male'),\n",
       " ('Piet', 'male'),\n",
       " ('Oran', 'male'),\n",
       " ('Raoul', 'male'),\n",
       " ('Zalman', 'male'),\n",
       " ('Rosemarie', 'female')]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "names[0:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gender_features(word):\n",
    "    return {'last_letter': word[-1]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[({'last_letter': 'l'}, 'female'),\n",
       " ({'last_letter': 'a'}, 'female'),\n",
       " ({'last_letter': 'a'}, 'female'),\n",
       " ({'last_letter': 'd'}, 'female'),\n",
       " ({'last_letter': 'l'}, 'male'),\n",
       " ({'last_letter': 't'}, 'male'),\n",
       " ({'last_letter': 'n'}, 'male'),\n",
       " ({'last_letter': 'l'}, 'male'),\n",
       " ({'last_letter': 'n'}, 'male'),\n",
       " ({'last_letter': 'e'}, 'female')]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "featuresets = [(gender_features(n), g) for (n, g) in names]\n",
    "featuresets[0:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_set, test_set = featuresets[500:], featuresets[:500]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'female'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk import NaiveBayesClassifier\n",
    "nb_classifier = NaiveBayesClassifier.train(train_set)\n",
    "nb_classifier.classify(gender_features('Gary'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.792"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk import classify\n",
    "classify.accuracy(nb_classifier, test_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most Informative Features\n",
      "             last_letter = 'k'              male : female =     45.1 : 1.0\n",
      "             last_letter = 'a'            female : male   =     34.4 : 1.0\n",
      "             last_letter = 'f'              male : female =     16.6 : 1.0\n",
      "             last_letter = 'p'              male : female =     11.9 : 1.0\n",
      "             last_letter = 'v'              male : female =     10.5 : 1.0\n"
     ]
    }
   ],
   "source": [
    "nb_classifier.show_most_informative_features(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
